#load necessary packages

library(rvest)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr) # for word count
library(ggplot2)
library(legislatoR)

Data import

After acquiring the data from the CLD containing the “wikititle”, I can use this to scrape the Wikipedia articles. I subsetted the data, focusing only on politicians who are still alive (death = NA), as I consider those politicians more relevant for my analysis than rather historical figures.

#import raw data 

#raw data was acquired using the following code (using France as an example)
# fr_core <-  get_core((legislature = "fra"))
# fr_core_alive <- fr_core %>% filter(is.na(death))

### text acquisition German (using German as an example)

# de_text_pipeline <- function(page_name) {
#   Sys.sleep(runif(1, 1, 2))
#   
#   # Check if page_name is missing
#   if (is.na(page_name) || page_name == "") {
#     return("No Wikipedia page name provided or missing.")
#   }
#   
#   # Try fetching Wikipedia content
#   tryCatch({
#     wp_content <- WikipediR::page_content("de", "wikipedia", page_name = page_name)
#     plain_text <- html_text(read_html(wp_content$parse$text$`*`))
#     return(plain_text)
#   }, error = function(e) {
#     return(paste("Error fetching content for page:", page_name))
#   })
# }

#read in data

# Load the pre-scraped Wikipedia article texts, one CSV per country,
# from the raw_data/ directory.
cze_alive_text <- read.csv(file.path("raw_data", "cze_alive_text.csv"))
deu_alive_text <- read.csv(file.path("raw_data", "deu_alive_text.csv"))
fr_alive_text <- read.csv(file.path("raw_data", "fr_alive_text.csv"))
usa_alive_text <- read.csv(file.path("raw_data", "usa_alive_text.csv"))
gbr_alive_text <- read.csv(file.path("raw_data", "gbr_alive_text.csv"))
sco_alive_text <- read.csv(file.path("raw_data", "sco_alive_text.csv"))
irl_alive_text <- read.csv(file.path("raw_data", "irl_alive_text.csv"))
esp_alive_text <- read.csv(file.path("raw_data", "esp_alive_text.csv"))
can_alive_text <- read.csv(file.path("raw_data", "can_alive_text.csv"))
aut_alive_text <- read.csv(file.path("raw_data", "aut_alive_text.csv"))

Functions

# Necessary functions for the following data preprocessing

clean_data <- function(df) {
  # Drop rows whose `plain_text` is not a usable Wikipedia article
  # (redirects, disambiguation/reference pages, fetch errors or missing
  # page names) and strip CSS-like leftovers from the scraped HTML.
  #
  # Prints a per-reason breakdown (a row matching several patterns is
  # counted once per reason, so the reasons may sum to more than the
  # total removed) plus the total number of rows removed.
  #
  # Returns the cleaned data frame.
  initial_rows <- nrow(df)

  # Remove CSS-like structures such as ".infobox{...}" left over from the
  # HTML-to-text conversion (lazy quantifiers need perl = TRUE).
  df$plain_text <- gsub("\\..*?\\{.*?\\}", "", df$plain_text, perl = TRUE)

  # Language-specific markers for the three kinds of unusable rows.
  redirect_pattern <- "^(Redirect to:|Weiterleitung nach:|Rediriger vers:|Redirige a:|Přesměrování na:)"
  refer_pattern <- "may refer to:|ist der Name folgender Personen:|Cette page d'homonymie répertorie différentes personnes|může být:"
  not_found_pattern <- "Error fetching content for page:|No Wikipedia page name provided or missing|Es wurde kein Wikipedia-Seitenname angegeben"

  # Compute each mask exactly once so the printed counts always agree with
  # the rows actually removed (previously the "not found" count was
  # ^-anchored while the filter was not, so the two could disagree).
  is_redirect <- grepl(redirect_pattern, df$plain_text, ignore.case = TRUE)
  is_refer <- grepl(refer_pattern, df$plain_text, ignore.case = TRUE)
  is_not_found <- grepl(not_found_pattern, df$plain_text, ignore.case = TRUE)

  # Keep only rows matching none of the removal patterns.
  df <- df[!(is_redirect | is_refer | is_not_found), , drop = FALSE]

  rows_removed <- initial_rows - nrow(df)

  # Print removal reasons
  cat("Removal reasons:\n")
  cat("  - Redirect:", sum(is_redirect), "\n")
  cat("  - Reference Page:", sum(is_refer), "\n")
  cat("  - Not Found/no name_provided:", sum(is_not_found), "\n")

  cat("Cleaned data: Removed", rows_removed, "rows.\n")

  # Return the cleaned data frame
  return(df)
}


traffic_metrics <- function(traffic_data) {
  # Summarise a traffic table into one row per politician, carrying the
  # total page views (`total_traffic`) and the average views per observed
  # month (`average_traffic`).

  # Collapse dates to year-month so the number of months can be counted.
  traffic_data$date <- format(traffic_data$date, "%Y-%m")
  n_months <- n_distinct(traffic_data$date)

  # Total views per politician, then the monthly average.
  result <- traffic_data %>%
    group_by(pageid) %>%
    summarise(total_traffic = sum(traffic)) %>%
    mutate(average_traffic = total_traffic / n_months)

  # Convert pageid to numeric (matches the original behavior; used for
  # later joins with the core data).
  result$pageid <- as.numeric(result$pageid)

  return(result)
}


count_words <- function(text) {
  # Count the word tokens in a single article text.
  #
  # Guard against missing/empty input: str_extract_all(NA, ...) yields a
  # single NA match, which the original counted as one word; return 0L
  # instead for NA, zero-length, or empty-string input.
  if (length(text) == 0 || is.na(text) || !nzchar(text)) {
    return(0L)
  }
  words <- str_extract_all(text, "\\b\\w+\\b")[[1]]
  return(length(words))
}

The function to clean the data removes unreadable parts of the HTML format and leaves us with human-readable text of the politician’s Wikipedia article. Further, it removes data points that didn’t successfully retrieve an article, for reasons of redirects (name changes), missing Wikipedia pages (no “wiki_title”), or reference pages (“may refer to…”).

Data cleaning

# Clean each country's raw article text. The printed counts report how many
# rows were dropped and why (redirect, disambiguation/reference page, or a
# fetch error / missing page name).
cze <- clean_data(cze_alive_text)
## Removal reasons:
##   - Redirect: 9 
##   - Reference Page: 3 
##   - Not Found/no name_provided: 1 
## Cleaned data: Removed 13 rows.
fra <- clean_data(fr_alive_text)
## Removal reasons:
##   - Redirect: 0 
##   - Reference Page: 6 
##   - Not Found/no name_provided: 1 
## Cleaned data: Removed 7 rows.
deu <- clean_data(deu_alive_text)
## Removal reasons:
##   - Redirect: 11 
##   - Reference Page: 27 
##   - Not Found/no name_provided: 4 
## Cleaned data: Removed 42 rows.
usa <- clean_data(usa_alive_text)
## Removal reasons:
##   - Redirect: 103 
##   - Reference Page: 14 
##   - Not Found/no name_provided: 0 
## Cleaned data: Removed 117 rows.
# Note: gbr and esp lose many rows to "not found" — discussed below.
gbr <- clean_data(gbr_alive_text)
## Removal reasons:
##   - Redirect: 55 
##   - Reference Page: 19 
##   - Not Found/no name_provided: 1598 
## Cleaned data: Removed 1672 rows.
irl <- clean_data(irl_alive_text)
## Removal reasons:
##   - Redirect: 19 
##   - Reference Page: 19 
##   - Not Found/no name_provided: 0 
## Cleaned data: Removed 38 rows.
sco <- clean_data(sco_alive_text)
## Removal reasons:
##   - Redirect: 3 
##   - Reference Page: 1 
##   - Not Found/no name_provided: 0 
## Cleaned data: Removed 4 rows.
esp <- clean_data(esp_alive_text)
## Removal reasons:
##   - Redirect: 28 
##   - Reference Page: 0 
##   - Not Found/no name_provided: 1057 
## Cleaned data: Removed 1085 rows.
aut <- clean_data(aut_alive_text)
## Removal reasons:
##   - Redirect: 7 
##   - Reference Page: 14 
##   - Not Found/no name_provided: 5 
## Cleaned data: Removed 26 rows.
can <- clean_data(can_alive_text)
## Removal reasons:
##   - Redirect: 25 
##   - Reference Page: 13 
##   - Not Found/no name_provided: 0 
## Cleaned data: Removed 38 rows.

I am using “wikititle” as the variable to query the API, which leads to some problems with redirects when politicians changed their name after the creation of the CLD. Unfortunately, scraping via the “pageid” did not work out. The number of redirects still seems acceptable to me. A bigger problem arises in the British and Spanish data, where a lot of data is missing because the pages were not found. This is due to missing “wikititle” and “pageid” values, which means that these politicians don’t have a unique Wikipedia page or Wikidata ID. It remains to be discussed whether this poses a problem for the further analysis of these countries’ politicians.

Data exploration

#combine all data in one df

# Stack the cleaned per-country data frames and keep only rows with a
# recorded sex (drops party pages and other non-person entries).
all_countries <- rbind(deu, cze, fra, usa, sco, irl, can, aut, esp, gbr) %>%
  filter(!is.na(sex))

First, let’s have a look at the dataset, containing all countries and see how it is setup.

# First look at the combined dataset: sample rows, per-column summaries,
# and the structure/type of each column.
head(all_countries)
summary(all_countries)
##    country             pageid           wikidataid         wikititle        
##  Length:14064       Length:14064       Length:14064       Length:14064      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      name               sex             ethnicity           religion        
##  Length:14064       Length:14064       Length:14064       Length:14064      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##     birth            death          birthplace         deathplace       
##  Length:14064       Mode:logical   Length:14064       Length:14064      
##  Class :character   NA's:14064     Class :character   Class :character  
##  Mode  :character                  Mode  :character   Mode  :character  
##   plain_text       
##  Length:14064      
##  Class :character  
##  Mode  :character
# All columns are character except `death`, which is all-NA logical because
# only living politicians (death = NA) were kept upstream.
str(all_countries)
## 'data.frame':    14064 obs. of  13 variables:
##  $ country   : chr  "DEU" "DEU" "DEU" "DEU" ...
##  $ pageid    : chr  "174000" "9980355" "5166669" "261258" ...
##  $ wikidataid: chr  "Q340387" "Q39678866" "Q340448" "Q354647" ...
##  $ wikititle : chr  "Achim_Großmann" "Achim_Kessler" "Achim_Post" "Adelheid_Tröscher" ...
##  $ name      : chr  "Achim Großmann" "Achim Kessler" "Achim Post" "Adelheid D. Tröscher" ...
##  $ sex       : chr  "male" "male" "male" "female" ...
##  $ ethnicity : chr  NA "white" "white" NA ...
##  $ religion  : chr  "catholicism" NA "protestantism lutheran" NA ...
##  $ birth     : chr  "1947-04-17" "1964-08-02" "1959-05-02" "1939-02-16" ...
##  $ death     : logi  NA NA NA NA NA NA ...
##  $ birthplace: chr  "50.77621,6.08379" "48.12472,8.33083" "52.41667,8.61667" "52.51667,13.38333" ...
##  $ deathplace: chr  NA NA NA NA ...
##  $ plain_text: chr  "Achim Großmann (* 17. April 1947 in Aachen; † 14. April 2023 in Würselen[1]) war ein deutscher Politiker (SPD)."| __truncated__ "Achim Kessler (2019)Achim Dieter Kessler (* 2. August 1964[1] in St. Georgen im Schwarzwald) ist ein deutscher "| __truncated__ "Achim Post (2018)Achim Post (* 2. Mai 1959 in Rahden) ist ein deutscher Politiker (SPD). Er ist seit dem 26. Au"| __truncated__ "Adelheid D. Tröscher (* 16. Februar 1939 in Berlin) ist eine deutsche Pädagogin und Politikerin (SPD). Sie war "| __truncated__ ...
# Bar chart: count of male vs. female politicians, one facet per country
# (free y-scales because countries differ greatly in size).
ggplot(all_countries, aes(x = sex, fill = sex)) +
  geom_bar() +
  facet_wrap(~country, scales = "free_y") +
  scale_fill_manual(values = c("female" = "pink", "male" = "blue")) +
  labs(title = "Number of male/female politicians per country",
       x = "sex",
       y = "number") +
  theme_minimal() +
  theme(legend.title = element_blank())

We can see that female politicians are, as can be expected, underrepresented in all countries. Still, this leaves us with a decent number of female politicians to compare to the male politicians.

Next, we want to have a look at the average monthly number of page views (traffic). This will be used to match female and male politicians in order to make them more comparable. Using this variable as matching variable is due to the hypothesis that “popularity” represents the primary confounder variable, when it comes to the length of texts and the number of edits. As a proxy variable, this measure ensures that the analysis only examines comparable men and women.

# deu_traffic <- get_traffic(legislature = "deu")
# deu_average_traffic <- traffic_metrics(deu_traffic)
# deu <- left_join(deu, select(deu_average_traffic, pageid, average_traffic), by = "pageid")
# 
# fra_traffic <- get_traffic(legislature = "fra")
# fra_average_traffic <- traffic_metrics(fra_traffic)
# fra <- left_join(fra, select(fra_average_traffic, pageid, average_traffic), by = "pageid")
# 
# #error
# gbr_traffic <- get_traffic(legislature = "gbr")
# gbr_average_traffic <- traffic_metrics(gbr_traffic)
# gbr$pageid <- as.character(gbr$pageid)
# gbr_average_traffic$pageid <- as.character(gbr_average_traffic$pageid)
# gbr <- left_join(gbr, select(gbr_average_traffic, pageid, average_traffic), by = "pageid")
# gbr$pageid <- as.numeric(gbr$pageid)
# 
# can_traffic <- get_traffic(legislature = "can")
# can_average_traffic <- traffic_metrics(can_traffic)
# can <- left_join(can, select(can_average_traffic, pageid, average_traffic), by = "pageid")
# 
# aut_traffic <- get_traffic(legislature = "aut")
# aut_average_traffic <- traffic_metrics(aut_traffic)
# aut <- left_join(aut, select(aut_average_traffic, pageid, average_traffic), by = "pageid")
# 
# 
# # introduces NAs exclusively, need to look into that
# esp_traffic <- get_traffic(legislature = "esp")
# esp_average_traffic <- traffic_metrics(esp_traffic)
# esp$pageid <- as.character(esp$pageid)
# esp_average_traffic$pageid <- as.character(esp_average_traffic$pageid)
# esp <- left_join(esp, select(esp_average_traffic, pageid, average_traffic), by = "pageid")
# esp$pageid <- as.numeric(esp$pageid)
# 
# cze_traffic <- get_traffic(legislature = "cze")
# cze_average_traffic <- traffic_metrics(cze_traffic)
# cze <- left_join(cze, select(cze_average_traffic, pageid, average_traffic), by = "pageid")
# 
# sco_traffic <- get_traffic(legislature = "sco")
# sco_average_traffic <- traffic_metrics(sco_traffic)
# sco <- left_join(sco, select(sco_average_traffic, pageid, average_traffic), by = "pageid")
# 
# irl_traffic <- get_traffic(legislature = "irl")
# irl_average_traffic <- traffic_metrics(irl_traffic)
# irl <- left_join(irl, select(irl_average_traffic, pageid, average_traffic), by = "pageid")
# 
# usa_house_traffic <- get_traffic(legislature = "usa_house")
# usa_senate_traffic <- get_traffic(legislature = "usa_senate")
# 
# usa_traffic <- bind_rows(usa_house_traffic, usa_senate_traffic)
# usa_average_traffic <- traffic_metrics(usa_traffic)
# usa <- left_join(usa, select(usa_average_traffic, pageid, average_traffic), by = "pageid")

Let’s get some insights into the average traffic variable for all the countries by looking at the top pages and creating a boxplot per country and per sex.

# all_countries_traffic <- rbind(deu, cze, fra, usa, sco, irl, can, aut, esp, gbr)

#keep only rows with non-missing sex: some countries include Wikipedia pages of parties; also, this analysis is based on a binary classification of gender for reasons of simplicity

# all_countries_traffic <- all_countries_traffic%>%
#   filter(!is.na(sex))
# 
# write.csv(all_countries_traffic, file = "clean_data/all_countries_traffic", row.names = FALSE)

all_countries_traffic <- read.csv("clean_data/all_countries_traffic")

First, let’s look at the top 3 politicians for average monthly traffic per country as a sanity check and to get an idea of the data:

# Sanity check: the three politicians with the highest average monthly
# traffic in each country. Sorting before grouping gives the same result,
# since arrange() ignores grouping by default.
all_countries_traffic %>%
  arrange(desc(average_traffic)) %>%
  group_by(country) %>%
  slice_head(n = 3)

Let’s plot the average traffic per gender/country to get an overview. I visualized this using a boxplot; the second one restricts the y-axis, excluding some outliers, so that we can better see the quartile distribution of the data.

# Boxplot of average monthly traffic by sex, faceted by country.
ggplot(all_countries_traffic, aes(x = sex, y = average_traffic, color = sex)) +
  geom_boxplot() +
  facet_wrap(~country, scales = "free_y") +
  scale_color_manual(values = c("female" = "pink", "male" = "blue")) +
  labs(title = "Distribution of Average Traffic per Country and Sex",
       x = "Sex",
       y = "Average Traffic") +
  theme_minimal() +
  theme(legend.position = "none")
## Warning: Removed 1431 rows containing non-finite values (`stat_boxplot()`).

Zooming in, to be able to see the quartiles, leaving out some outliers:

# Same boxplot with the y-axis limited to [1, 1500] so the quartile boxes
# become visible. scale_y_continuous(limits = ...) is what ylim() expands
# to: values outside the limits are dropped, hence the warning.
ggplot(all_countries_traffic, aes(x = sex, y = average_traffic, color = sex)) +
  geom_boxplot() +
  facet_wrap(~country, scales = "free_y") +
  scale_color_manual(values = c("female" = "pink", "male" = "blue")) +
  scale_y_continuous(limits = c(1, 1500)) +
  labs(title = "Distribution of Average Traffic per Country and Sex (restricted Y-axis)",
       x = "Sex",
       y = "Average Traffic") +
  theme_minimal() +
  theme(legend.position = "none")
## Warning: Removed 3356 rows containing non-finite values (`stat_boxplot()`).

Now, let’s have a look at the word counts for female and male politicians (for the further analysis, this data will be matched on the confounding aspect of popularity).

#get word count

# vapply (rather than sapply) guarantees a type-stable integer result even
# for empty input; USE.NAMES = FALSE avoids naming the column by the text.
all_countries$word_count <- vapply(all_countries$plain_text, count_words,
                                   integer(1), USE.NAMES = FALSE)

# Average word count per country/sex combination.
avg_word_count <- all_countries %>%
  group_by(country, sex) %>%
  summarise(avg_word_count = mean(word_count))
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
# Bar chart of the averages, faceted by country.
ggplot(avg_word_count, aes(x = sex, y = avg_word_count, fill = sex)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ country, scales = "free_y") +
  labs(title = "Average Word Count per Sex in Each Country",
       x = "Sex",
       y = "Average Word Count") +
  scale_fill_manual(values = c("male" = "blue", "female" = "pink")) +
  theme_minimal() +
  theme(legend.position = "none")

Looking at the plots, the word count and the average traffic do not seem to be very different for males/females. Still, I suggest using average traffic as a matching variable to keep a balanced dataset for the analysis and exclude possible confounders.